# R script for estimating national excess deaths
# Step I: Applying the sex-differential method to full-count census data

# START SCRIPT

rm(list = ls())

# Load libraries ----------------------------------------------------------

if (!require('readxl')) install.packages('readxl')

# Compute basic cohort statistics -----------------------------------------

census_years <- seq(1850, 1880, by = 10)

## Read cohort sizes ----

# cohort size for native-born white males by census year
WM_by_agegroup <- lapply(census_years, function(year) readxl::read_xlsx('DAT_Ncohort_nbwm_1850to1880.xlsx', sheet = as.character(year))) |>
  `names<-`(census_years)

WM_1850 = WM_by_agegroup$`1850`
WM_1860 = WM_by_agegroup$`1860`
WM_1870 = WM_by_agegroup$`1870`
WM_1880 = WM_by_agegroup$`1880`

# cohort size for  native-born white females by census year
WF_by_agegroup <- lapply(census_years, function(year) readxl::read_xlsx('DAT_Ncohort_nbwf_1850to1880.xlsx', sheet = as.character(year))) |>
  `names<-`(census_years)

WF_1850 = WF_by_agegroup$`1850`
WF_1860 = WF_by_agegroup$`1860`
WF_1870 = WF_by_agegroup$`1870`
WF_1880 = WF_by_agegroup$`1880`

## Compute survival rates by cohort ----

compute_survivalrate_by_agegroup = function(x1, x2) {
  rate = vector("numeric", length = 8)
  for (i in 2:9) {
    rate[i-1] <- x2[i+2] / x1[i]
  }
  names(rate) <- c('5–9', '10–14', '15–19', '20–24', '25–29', '30–34', '35–39', '40–44')
  return(rate)
}

# survival rates for nbw males
WM_1850to1860 = compute_survivalrate_by_agegroup(as.numeric(WM_1850), as.numeric(WM_1860))
WM_1860to1870 = compute_survivalrate_by_agegroup(as.numeric(WM_1860), as.numeric(WM_1870))
WM_1870to1880 = compute_survivalrate_by_agegroup(as.numeric(WM_1870), as.numeric(WM_1880))

# survival rates for nbw females
WF_1850to1860 = compute_survivalrate_by_agegroup(as.numeric(WF_1850), as.numeric(WF_1860))
WF_1860to1870 = compute_survivalrate_by_agegroup(as.numeric(WF_1860), as.numeric(WF_1870))
WF_1870to1880 = compute_survivalrate_by_agegroup(as.numeric(WF_1870), as.numeric(WF_1880))

# Compute sex differentials by cohort -------------------------------------

## Compute observed differentials ----

D_1850to1860 = WM_1850to1860 - WF_1850to1860
D_1860to1870 = WM_1860to1870 - WF_1860to1870
D_1870to1880 = WM_1870to1880 - WF_1870to1880

## Compute expected differentials ----

ED_1860to1870_1850to1860 = D_1850to1860
ED_1860to1870_1870to1880 = D_1870to1880
ED_1860to1870_AVG = (D_1850to1860+D_1870to1880)/2

## Compute excess death rates ----

DD_base1850to1860 = ED_1860to1870_1850to1860 - D_1860to1870
DD_base1870to1880 = ED_1860to1870_1870to1880 - D_1860to1870
DD_baseAVG = ED_1860to1870_AVG - D_1860to1870

# Compute death estimates -------------------------------------------------

## Compute baseline population and excess death rates ----

WM_1860_base = WM_1860[,c('10–14', '15–19', '20–24', '25–29', '30–34', '35–39', '40–44')]
DeathRate_base1850to1860 = DD_base1850to1860[c('10–14', '15–19', '20–24', '25–29', '30–34', '35–39', '40–44')]
DeathRate_base1870to1880 = DD_base1870to1880[c('10–14', '15–19', '20–24', '25–29', '30–34', '35–39', '40–44')]
DeathRate_baseAVG = DD_baseAVG[c('10–14', '15–19', '20–24', '25–29', '30–34', '35–39', '40–44')]

## Compute excess deaths I: native-born white males only ----

Death_base1850to1860 = WM_1860_base*DeathRate_base1850to1860
Death_base1850to1860_sum = sum(Death_base1850to1860) 
Death_base1870to1880 = WM_1860_base*DeathRate_base1870to1880
Death_base1870to1880_sum = sum(Death_base1870to1880) 
Death_baseAVG = WM_1860_base*DeathRate_baseAVG
Death_baseAVG_sum = sum(Death_baseAVG) 

print(sprintf('Est. total excess deaths among native-born white males: approx. %s', round(Death_baseAVG_sum)))
# "Est. total excess deaths among native-born white males: approx. 496332"

## Compute excess deaths II: including foreign-born whites ----

WM_1860_baseALL = readxl::read_xlsx('DAT_Ncohort_wm_1860.xlsx') # cohort size for all white males in 1860
DeathALL_base1850to1860 = WM_1860_baseALL*DeathRate_base1850to1860
DeathALL_base1850to1860_sum = sum(DeathALL_base1850to1860) 
DeathALL_base1870to1880 = WM_1860_baseALL*DeathRate_base1870to1880
DeathALL_base1870to1880_sum = sum(DeathALL_base1870to1880) 
DeathALL_baseAVG = WM_1860_baseALL*DeathRate_baseAVG
DeathALL_baseAVG_sum = sum(DeathALL_baseAVG) 

print(sprintf('Est. total excess deaths among all white males: approx. %s', round(DeathALL_baseAVG_sum)))
# "Est. total excess deaths among all white males: approx. 622280"

## Compute excess deaths III: accounting for 1860 census undercounts ----

WM_1860_baseALLADJ = WM_1860_baseALL/(1-0.06)
DeathALLADJ_base1850to1860 = WM_1860_baseALLADJ*DeathRate_base1850to1860
DeathALLADJ_base1850to1860_sum = sum(DeathALLADJ_base1850to1860) 
DeathALLADJ_base1870to1880 = WM_1860_baseALLADJ*DeathRate_base1870to1880
DeathALLADJ_base1870to1880_sum = sum(DeathALLADJ_base1870to1880) 
DeathALLADJ_baseAVG = WM_1860_baseALLADJ*DeathRate_baseAVG
DeathALLADJ_baseAVG_sum = sum(DeathALLADJ_baseAVG) 

print(sprintf('Est. total excess deaths accounting for 6pct undercount: approx. %s', round(DeathALLADJ_baseAVG_sum)))
# "Est. total excess deaths accounting for 6pct undercount: approx. 662000"

## Compute excess deaths IV: including black soldier deaths ----

DeathALLADJ_base1850to1860_sumBLA = DeathALLADJ_base1850to1860_sum + 36000 
DeathALLADJ_base1870to1880_sumBLA = DeathALLADJ_base1870to1880_sum + 36000 
DeathALLADJ_baseAVG_sumBLA = DeathALLADJ_baseAVG_sum + 36000 

print(sprintf('Est. total excess deaths including 36k black soldier deaths: approx. %s', round(DeathALLADJ_baseAVG_sumBLA)))
# "Est. total excess deaths including 36k black soldier deaths: approx. 698000"

# Export excess death estimates -------------------------------------------

baseline_deaths = cbind.data.frame("1850-1860" = t(Death_base1850to1860), "1870-1880" = t(Death_base1870to1880), "Average" = t(Death_baseAVG))
baseline_deaths = cbind.data.frame("Age Group" = rownames(baseline_deaths), baseline_deaths)
rownames(baseline_deaths) <- NULL
baseline_deaths = rbind.data.frame(
  baseline_deaths, 
  cbind.data.frame("Age Group" = "Total", t(c(
    Death_base1850to1860_sum,Death_base1870to1880_sum,Death_baseAVG_sum
  ) |> `names<-`(c('1850-1860', '1870-1880','Average')))))

with_foreignborn = cbind.data.frame("1850-1860" = t(DeathALL_base1850to1860), "1870-1880" = t(DeathALL_base1870to1880), "Average" = t(DeathALL_baseAVG))
with_foreignborn = cbind.data.frame("Age Group" = rownames(with_foreignborn), with_foreignborn)
rownames(with_foreignborn) <- NULL
with_foreignborn = rbind.data.frame(
  with_foreignborn, 
  cbind.data.frame("Age Group" = "Total", t(c(
    DeathALL_base1850to1860_sum,DeathALL_base1870to1880_sum,DeathALL_baseAVG_sum
  ) |> `names<-`(c('1850-1860', '1870-1880','Average')))))

with_undercount = cbind.data.frame("1850-1860" = t(DeathALLADJ_base1850to1860), "1870-1880" = t(DeathALLADJ_base1870to1880), "Average" = t(DeathALLADJ_baseAVG))
with_undercount = cbind.data.frame("Age Group" = rownames(with_undercount), with_undercount)
rownames(with_undercount) <- NULL
with_undercount = rbind.data.frame(
  with_undercount, 
  cbind.data.frame("Age Group" = "Total", t(c(
    DeathALLADJ_base1850to1860_sum,DeathALLADJ_base1870to1880_sum,DeathALLADJ_baseAVG_sum
  ) |> `names<-`(c('1850-1860', '1870-1880','Average')))))

with_blacks = cbind.data.frame(
  "Age Group" = "Total",
  t(c(DeathALLADJ_base1850to1860_sumBLA, DeathALLADJ_base1870to1880_sumBLA, DeathALLADJ_baseAVG_sumBLA) |> `names<-`(c('1850-1860', '1870-1880','Average')))
)

all_estimates = rbind.data.frame(
  cbind.data.frame("Sample" = "Native White Males Only", baseline_deaths),
  cbind.data.frame("Sample" = "All White Males", with_foreignborn),
  cbind.data.frame("Sample" = "Adjusted for undercount", with_undercount),
  cbind.data.frame("Sample" = "Including Black Deaths", with_blacks)
)
writexl::write_xlsx(all_estimates, "EST_excdeaths_national.xlsx")

# END SCRIPT HERE


